# Enable plotly's offline mode so figures render inline in the notebook
# without contacting plotly's servers.
import plotly
plotly.offline.init_notebook_mode()
Problem statement: risk analysis of patients with diabetes — modeling disease progression from baseline measurements.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
# Load the diabetes dataset
# Returns a Bunch with 'data' (442 x 10 standardized features), 'target'
# (quantitative disease progression one year after baseline), 'DESCR',
# and 'feature_names' -- see the dump below.
data_diabetes = datasets.load_diabetes()
# Display the raw Bunch contents (notebook cell output).
data_diabetes
{'data': array([[ 0.03807591, 0.05068012, 0.06169621, ..., -0.00259226,
0.01990749, -0.01764613],
[-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
-0.06833155, -0.09220405],
[ 0.08529891, 0.05068012, 0.04445121, ..., -0.00259226,
0.00286131, -0.02593034],
...,
[ 0.04170844, 0.05068012, -0.01590626, ..., -0.01107952,
-0.04688253, 0.01549073],
[-0.04547248, -0.04464164, 0.03906215, ..., 0.02655962,
0.04452873, -0.02593034],
[-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
-0.00422151, 0.00306441]]),
'target': array([151., 75., 141., 206., 135., 97., 138., 63., 110., 310., 101.,
69., 179., 185., 118., 171., 166., 144., 97., 168., 68., 49.,
68., 245., 184., 202., 137., 85., 131., 283., 129., 59., 341.,
87., 65., 102., 265., 276., 252., 90., 100., 55., 61., 92.,
259., 53., 190., 142., 75., 142., 155., 225., 59., 104., 182.,
128., 52., 37., 170., 170., 61., 144., 52., 128., 71., 163.,
150., 97., 160., 178., 48., 270., 202., 111., 85., 42., 170.,
200., 252., 113., 143., 51., 52., 210., 65., 141., 55., 134.,
42., 111., 98., 164., 48., 96., 90., 162., 150., 279., 92.,
83., 128., 102., 302., 198., 95., 53., 134., 144., 232., 81.,
104., 59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
173., 180., 84., 121., 161., 99., 109., 115., 268., 274., 158.,
107., 83., 103., 272., 85., 280., 336., 281., 118., 317., 235.,
60., 174., 259., 178., 128., 96., 126., 288., 88., 292., 71.,
197., 186., 25., 84., 96., 195., 53., 217., 172., 131., 214.,
59., 70., 220., 268., 152., 47., 74., 295., 101., 151., 127.,
237., 225., 81., 151., 107., 64., 138., 185., 265., 101., 137.,
143., 141., 79., 292., 178., 91., 116., 86., 122., 72., 129.,
142., 90., 158., 39., 196., 222., 277., 99., 196., 202., 155.,
77., 191., 70., 73., 49., 65., 263., 248., 296., 214., 185.,
78., 93., 252., 150., 77., 208., 77., 108., 160., 53., 220.,
154., 259., 90., 246., 124., 67., 72., 257., 262., 275., 177.,
71., 47., 187., 125., 78., 51., 258., 215., 303., 243., 91.,
150., 310., 153., 346., 63., 89., 50., 39., 103., 308., 116.,
145., 74., 45., 115., 264., 87., 202., 127., 182., 241., 66.,
94., 283., 64., 102., 200., 265., 94., 230., 181., 156., 233.,
60., 219., 80., 68., 332., 248., 84., 200., 55., 85., 89.,
31., 129., 83., 275., 65., 198., 236., 253., 124., 44., 172.,
114., 142., 109., 180., 144., 163., 147., 97., 220., 190., 109.,
191., 122., 230., 242., 248., 249., 192., 131., 237., 78., 135.,
244., 199., 270., 164., 72., 96., 306., 91., 214., 95., 216.,
263., 178., 113., 200., 139., 139., 88., 148., 88., 243., 71.,
77., 109., 272., 60., 54., 221., 90., 311., 281., 182., 321.,
58., 262., 206., 233., 242., 123., 167., 63., 197., 71., 168.,
140., 217., 121., 235., 245., 40., 52., 104., 132., 88., 69.,
219., 72., 201., 110., 51., 277., 63., 118., 69., 273., 258.,
43., 198., 242., 232., 175., 93., 168., 275., 293., 281., 72.,
140., 189., 181., 209., 136., 261., 113., 131., 174., 257., 55.,
84., 42., 146., 212., 233., 91., 111., 152., 120., 67., 310.,
94., 183., 66., 173., 72., 49., 64., 48., 178., 104., 132.,
220., 57.]),
'frame': None,
'DESCR': '.. _diabetes_dataset:\n\nDiabetes dataset\n----------------\n\nTen baseline variables, age, sex, body mass index, average blood\npressure, and six blood serum measurements were obtained for each of n =\n442 diabetes patients, as well as the response of interest, a\nquantitative measure of disease progression one year after baseline.\n\n**Data Set Characteristics:**\n\n :Number of Instances: 442\n\n :Number of Attributes: First 10 columns are numeric predictive values\n\n :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n\n :Attribute Information:\n - age age in years\n - sex\n - bmi body mass index\n - bp average blood pressure\n - s1 tc, total serum cholesterol\n - s2 ldl, low-density lipoproteins\n - s3 hdl, high-density lipoproteins\n - s4 tch, total cholesterol / HDL\n - s5 ltg, possibly log of serum triglycerides level\n - s6 glu, blood sugar level\n\nNote: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).\n\nSource URL:\nhttps://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n\nFor more information see:\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.\n(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n',
'feature_names': ['age',
'sex',
'bmi',
'bp',
's1',
's2',
's3',
's4',
's5',
's6'],
'data_filename': 'diabetes_data_raw.csv.gz',
'target_filename': 'diabetes_target.csv.gz',
'data_module': 'sklearn.datasets.data'}
# Combine the feature matrix and the response into one DataFrame:
# ten named predictor columns plus a 'target' column.
df_diabetes = (
    pd.DataFrame(data_diabetes.data, columns=data_diabetes.feature_names)
    .assign(target=data_diabetes.target)
)
# Preview the first five rows.
df_diabetes.head()
| age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.038076 | 0.050680 | 0.061696 | 0.021872 | -0.044223 | -0.034821 | -0.043401 | -0.002592 | 0.019907 | -0.017646 | 151.0 |
| 1 | -0.001882 | -0.044642 | -0.051474 | -0.026328 | -0.008449 | -0.019163 | 0.074412 | -0.039493 | -0.068332 | -0.092204 | 75.0 |
| 2 | 0.085299 | 0.050680 | 0.044451 | -0.005670 | -0.045599 | -0.034194 | -0.032356 | -0.002592 | 0.002861 | -0.025930 | 141.0 |
| 3 | -0.089063 | -0.044642 | -0.011595 | -0.036656 | 0.012191 | 0.024991 | -0.036038 | 0.034309 | 0.022688 | -0.009362 | 206.0 |
| 4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031988 | -0.046641 | 135.0 |
# Summary statistics (count, mean, std, min/quartiles/max) for every column.
# Note the features are mean-centered and scaled, hence the tiny means.
df_describe = df_diabetes.describe()
df_describe
| age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 442.000000 |
| mean | -2.511817e-19 | 1.230790e-17 | -2.245564e-16 | -4.797570e-17 | -1.381499e-17 | 3.918434e-17 | -5.777179e-18 | -9.042540e-18 | 9.293722e-17 | 1.130318e-17 | 152.133484 |
| std | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 77.093005 |
| min | -1.072256e-01 | -4.464164e-02 | -9.027530e-02 | -1.123988e-01 | -1.267807e-01 | -1.156131e-01 | -1.023071e-01 | -7.639450e-02 | -1.260971e-01 | -1.377672e-01 | 25.000000 |
| 25% | -3.729927e-02 | -4.464164e-02 | -3.422907e-02 | -3.665608e-02 | -3.424784e-02 | -3.035840e-02 | -3.511716e-02 | -3.949338e-02 | -3.324559e-02 | -3.317903e-02 | 87.000000 |
| 50% | 5.383060e-03 | -4.464164e-02 | -7.283766e-03 | -5.670422e-03 | -4.320866e-03 | -3.819065e-03 | -6.584468e-03 | -2.592262e-03 | -1.947171e-03 | -1.077698e-03 | 140.500000 |
| 75% | 3.807591e-02 | 5.068012e-02 | 3.124802e-02 | 3.564379e-02 | 2.835801e-02 | 2.984439e-02 | 2.931150e-02 | 3.430886e-02 | 3.243232e-02 | 2.791705e-02 | 211.500000 |
| max | 1.107267e-01 | 5.068012e-02 | 1.705552e-01 | 1.320436e-01 | 1.539137e-01 | 1.987880e-01 | 1.811791e-01 | 1.852344e-01 | 1.335973e-01 | 1.356118e-01 | 346.000000 |
# Per-column histograms to inspect each variable's distribution.
df_diabetes.hist(figsize=(12,10))
plt.show()
# Pairwise Pearson correlation between all columns (features and target),
# visualized as an annotated heatmap.
df_diabetes_corr = df_diabetes.corr()
plt.figure(figsize=(12,10))
sns.heatmap(df_diabetes_corr, annot=True)
plt.title('Correlation Matrix of Diabetes Dataset')
plt.show()
from sklearn.model_selection import train_test_split
# We use only the BMI column because the task statement designates BMI as
# the independent variable and 'target' (disease progression) as the
# dependent variable.
X = df_diabetes[['bmi']]
y = df_diabetes['target']
# 70% train / 15% validation / 15% test. A fixed random_state makes the
# split -- and therefore every metric reported below -- reproducible
# across runs; without it each execution scores a different split.
X_train, X_old, y_train, y_old = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_old, y_old, test_size=0.5, random_state=42)
print(X_train.shape, X_val.shape, X_test.shape)
(309, 1) (66, 1) (67, 1)
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
def create_poly_model(X, y, degrees):
    """Fit one polynomial-regression pipeline per requested degree.

    Each pipeline expands X with PolynomialFeatures(degree) and fits a
    LinearRegression on the expanded features. Returns a dict mapping
    degree -> fitted pipeline.
    """
    fitted = {}
    for deg in degrees:
        pipeline = Pipeline([
            ('polynomial', PolynomialFeatures(degree=deg)),
            ('linear', LinearRegression()),
        ])
        # Pipeline.fit returns the pipeline itself, so this stores the
        # fitted estimator.
        fitted[deg] = pipeline.fit(X, y)
    return fitted
# Fit polynomial models of degree 0 through 5 on the training split.
degrees = list(range(0, 6))
models = create_poly_model(X_train, y_train, degrees)
# Echo every fitted pipeline for inspection.
for degree in models:
    print(f'Degree: {degree}')
    print(f'Model: {models[degree]}\n')
Degree: 0
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=0)),
('linear', LinearRegression())])
Degree: 1
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=1)),
('linear', LinearRegression())])
Degree: 2
Model: Pipeline(steps=[('polynomial', PolynomialFeatures()),
('linear', LinearRegression())])
Degree: 3
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=3)),
('linear', LinearRegression())])
Degree: 4
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=4)),
('linear', LinearRegression())])
Degree: 5
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=5)),
('linear', LinearRegression())])
from sklearn.metrics import r2_score, mean_absolute_error
# MAPE function
def mape(y_act, y_pred):
    """Mean Absolute Percentage Error, as a percentage.

    Inputs are coerced to float ndarrays, so plain Python lists, pandas
    Series, and numpy arrays are all accepted (the original required
    array types supporting elementwise '-' and '/').

    NOTE: undefined when y_act contains zeros (division by zero). The
    diabetes targets are strictly positive, so that case never arises here.
    """
    y_act = np.asarray(y_act, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_act - y_pred) / y_act)) * 100
# Model selection: report R2 / MAE / MAPE on both train and validation
# splits for every degree. The validation scores (not the train scores)
# indicate which degree generalizes best.
for degree, model in models.items():
    # Predictions for train and validation set
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    print(f'Degree: {degree}')
    # Train Data
    print(f' Train R2: {r2_score(y_train, y_train_pred)}')
    print(f' Train MAE: {mean_absolute_error(y_train, y_train_pred)}')
    print(f' Train MAPE: {mape(y_train, y_train_pred)}\n')
    # Validation Data
    print(f' Val R2: {r2_score(y_val, y_val_pred)}')
    print(f' Val MAE: {mean_absolute_error(y_val, y_val_pred)}')
    print(f' Val MAPE: {mape(y_val, y_val_pred)}\n')
Degree: 0
Train R2: 0.0
Train MAE: 67.57155873943508
Train MAPE: 63.22079557268291
Val R2: -0.015216189178788975
Val MAE: 53.33965872315387
Val MAPE: 49.74814387824455
Degree: 1
Train R2: 0.39506875105645234
Train MAE: 49.813871472226914
Train MAPE: 46.20225038336912
Val R2: -0.0409139094701958
Val MAE: 56.13393800785215
Val MAPE: 47.957769381130454
Degree: 2
Train R2: 0.39523244356146825
Train MAE: 49.8067540678438
Train MAPE: 46.17728171872351
Val R2: -0.03633759992609775
Val MAE: 56.000101186245885
Val MAPE: 47.7653520590214
Degree: 3
Train R2: 0.4015699487833455
Train MAE: 49.12688599328037
Train MAPE: 45.45320670325321
Val R2: -0.050041290167614605
Val MAE: 55.85397765617027
Val MAPE: 47.03024581498172
Degree: 4
Train R2: 0.40166108458559047
Train MAE: 49.13235825623285
Train MAPE: 45.414285542233266
Val R2: -0.04880159279634455
Val MAE: 55.78119348378122
Val MAPE: 46.98637439333626
Degree: 5
Train R2: 0.4039026126528047
Train MAE: 49.040414808408684
Train MAPE: 45.29869386436929
Val R2: -0.049438128656875246
Val MAE: 55.89220579526998
Val MAPE: 47.18751189682845
from sklearn.metrics import mean_squared_error
# Final evaluation of the chosen degree-1 model on the held-out test set.
# BUG FIX: the original called models[1].fit(X_test, y_test) here, which
# retrained the model on the test data before scoring it -- data leakage
# that turns the "test" metrics into in-sample scores. The model must be
# used exactly as fitted on the training split. (The original comment
# also claimed degree 5 while indexing degree 1.)
y_test_pred = models[1].predict(X_test)
# Evaluating the model with test data
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
print('Test Data')
print(f'R2 score: {r2_test}')
print(f'MSE: {mse_test}')
print(f'MAE: {mae_test}')
Test Data R2 score: 0.3318660414578588 MSE: 4245.949185370177 MAE: 55.29487357659975
# Plotting the model
# Scatter all three splits and overlay the fitted degree-1 line on each.
plt.figure(figsize=(10,8))
plt.scatter(X_train, y_train, color='blue', label='Train Data')
plt.scatter(X_val, y_val, color='red', label='Validation Data')
plt.scatter(X_test, y_test, color='green', label='Test Data')
# NOTE(review): plt.plot connects points in the order they appear in X,
# which is unsorted here. That is harmless for a straight degree-1 line,
# but higher-degree curves would need X sorted before plotting.
plt.plot(X_test, y_test_pred, color='black', label='Degree 1 test data')
plt.plot(X_train, models[1].predict(X_train), color='yellow', label='Degree 1 train data')
plt.plot(X_val, models[1].predict(X_val), color='orange', label='Degree 1 val data')
plt.title('Polynomial Regression with model degree 1')
plt.xlabel('BMI')
plt.ylabel('Target')
plt.legend()
plt.show()
def print_pipeline_model_stats(model):
    """Print a fitted pipeline, its linear coefficients and intercept,
    and a readable polynomial equation string built from them."""
    regressor = model[-1]  # last pipeline step: the linear regressor
    print(f'Model: {model}')
    print(f'Coefficients: {regressor.coef_}')
    print(f'Intercept: {regressor.intercept_}')
    # Build "y = c0 * x^0 + c1 * x^1 + ... + intercept" -- one term per
    # coefficient, each followed by ' + ', then the intercept value.
    pieces = [f'{c:.2f} * x^{power} + ' for power, c in enumerate(regressor.coef_)]
    equation = 'y = ' + ''.join(pieces) + f'{regressor.intercept_:.2f}'
    print(f'Equation: {equation}')
# Inspect the chosen degree-1 model's fitted parameters and equation.
print_pipeline_model_stats(models[1])
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=1)),
('linear', LinearRegression())])
Coefficients: [ 0. 923.27355022]
Intercept: 153.73795864250303
Equation: y = 0.00 * x^0 + 923.27 * x^1 + 153.74
# Sanity check: compute the degree-1 prediction by hand from the printed
# equation and compare it with the pipeline's own prediction.
bmi_manual = 0.05
y_pred_manual = 923.27 * (bmi_manual)**1 + 153.74
print(y_pred_manual)
# FIX: pass a DataFrame with the 'bmi' column name instead of a bare
# [[value]] list. The pipeline was fitted on a named DataFrame, so an
# unnamed array triggers sklearn's "X does not have valid feature names"
# UserWarning (visible in the original output below).
y_model = models[1].predict(pd.DataFrame({'bmi': [bmi_manual]}))
print(y_model)
199.9035 [199.90163615]
d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\sklearn\base.py:439: UserWarning: X does not have valid feature names, but PolynomialFeatures was fitted with feature names
# Count trainable parameters (bias column + polynomial terms) produced by
# each degree's feature expansion, and list the generated feature names.
trainable_params = {}
for degree in range(6):
    poly = PolynomialFeatures(degree=degree)
    expanded = poly.fit_transform(X_train)
    # One linear coefficient per expanded column.
    trainable_params[degree] = expanded.shape[1]
    print(f'Degree {degree}: {poly.get_feature_names_out()}')
degrees = list(trainable_params)
params = list(trainable_params.values())
print('Degrees:', degrees)
print('Trainable Parameters:', params)
Degree 0: ['1'] Degree 1: ['1' 'bmi'] Degree 2: ['1' 'bmi' 'bmi^2'] Degree 3: ['1' 'bmi' 'bmi^2' 'bmi^3'] Degree 4: ['1' 'bmi' 'bmi^2' 'bmi^3' 'bmi^4'] Degree 5: ['1' 'bmi' 'bmi^2' 'bmi^3' 'bmi^4' 'bmi^5'] Degrees: [0, 1, 2, 3, 4, 5] Trainable Parameters: [1, 2, 3, 4, 5, 6]